In [1]:
%pylab
%matplotlib inline


Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib

In [2]:
cd ..


/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-net-work

In [3]:
import sys
import numpy as np
import skimage
import cv2
import sklearn
import sklearn.cross_validation
import sklearn.preprocessing
import sklearn.feature_selection
import sklearn.metrics
import sklearn.cluster
import sklearn.externals.joblib
import imp

In [4]:
from holoviews import *


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [5]:
import neukrill_net.utils
import neukrill_net.highlevelfeatures

In [6]:
import time

In [7]:
settings = neukrill_net.utils.Settings('settings.json')

In [8]:
X,y = settings.flattened_train_paths(settings.classes)

In [9]:
pkl_names = ['pftas.pkl','contourhistogram.pkl','contourmoments.pkl','haralick.pkl']

In [11]:
t0 = time.time()
hlf = []
XF_list = []
for pkl_name in pkl_names:
    tmp = sklearn.externals.joblib.load('cache/'+pkl_name)
    hlf += [tmp[0]]
    XF_list += [tmp[1]]
print("Loading features took {}".format(time.time()-t0))


Loading features took 0.176540851593

In [12]:
XF = np.concatenate(XF_list,2)

In [13]:
XF.shape


Out[13]:
(1, 30336, 366)

In [14]:
XF[0,0,:]


Out[14]:
array([  9.27835052e-02,   8.24742268e-02,   1.28865979e-01,
         2.57731959e-01,   2.21649485e-01,   1.23711340e-01,
         5.67010309e-02,   1.03092784e-02,   2.57731959e-02,
         9.27835052e-02,   8.24742268e-02,   1.28865979e-01,
         2.57731959e-01,   2.21649485e-01,   1.23711340e-01,
         5.67010309e-02,   1.03092784e-02,   2.57731959e-02,
         7.88043478e-02,   8.69565217e-02,   1.35869565e-01,
         1.87500000e-01,   1.19565217e-01,   8.15217391e-02,
         1.22282609e-01,   1.03260870e-01,   8.42391304e-02,
         9.22432432e-01,   2.94594595e-02,   1.89189189e-02,
         1.70270270e-02,   1.00000000e-02,   1.62162162e-03,
         5.40540541e-04,   0.00000000e+00,   0.00000000e+00,
         9.22432432e-01,   2.94594595e-02,   1.89189189e-02,
         1.70270270e-02,   1.00000000e-02,   1.62162162e-03,
         5.40540541e-04,   0.00000000e+00,   0.00000000e+00,
         8.10550199e-01,   8.11117413e-02,   4.42427680e-02,
         3.14804311e-02,   1.95689166e-02,   1.04934770e-02,
         1.70164492e-03,   8.50822462e-04,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.03626940e-02,   0.00000000e+00,   1.55440411e-02,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   1.03626940e-02,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.03626940e-02,   5.18134702e-03,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   5.18134702e-03,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   1.03626940e-02,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   1.03626940e-02,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   5.18134702e-03,   0.00000000e+00,
         1.55440411e-02,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   5.18134702e-03,
         1.03626940e-02,   2.59067360e-02,   1.55440411e-02,
         1.55440411e-02,   1.03626940e-02,   1.03626940e-02,
         0.00000000e+00,   1.03626940e-02,   1.03626940e-02,
         1.03626940e-02,   3.10880821e-02,   2.07253881e-02,
         1.55440411e-02,   2.07253881e-02,   1.55440411e-02,
         2.07253881e-02,   1.03626940e-02,   2.07253881e-02,
         3.10880821e-02,   2.07253881e-02,   1.55440411e-02,
         1.55440411e-02,   2.07253881e-02,   2.07253881e-02,
         2.59067360e-02,   1.55440411e-02,   2.59067360e-02,
         3.62694301e-02,   5.18134721e-02,   5.18134721e-02,
         3.62694301e-02,   3.10880821e-02,   2.59067360e-02,
         2.07253881e-02,   2.07253881e-02,   2.07253881e-02,
         4.14507762e-02,   4.14507762e-02,   9.32642519e-02,
         1.65803105e-01,   3.52331609e-01,   8.96373034e-01,
         3.26424867e-01,   2.55193896e-01,   2.04862918e+00,
         1.44272580e+00,   2.12646011e+00,  -4.62433192e+00,
         3.15379871e+00,   3.07003271e+04,   1.39122141e+05,
         2.54433417e+05,   2.35574674e-01,   8.21080723e+06,
        -1.02133457e+05,   4.17133345e+04,   3.20081449e-01,
         8.22184495e+06,  -4.12477274e-02,   2.76398107e+03,
         1.50253777e+04,   2.12090229e-02,   6.06816516e-03,
         3.34214333e+05,   1.16103130e+07,   3.61000000e+02,
         1.04675000e+04,   1.95815874e+05,   7.90824091e-02,
         5.61860172e-02,   8.67950000e+03,   2.50394000e+05,
         7.29116057e+06,   5.92909729e-01,   1.17428038e+02,
         8.37376309e-01,   3.60378916e+02,   8.37396275e-01,
         5.04056774e+02,   1.32408763e+03,   1.77634694e+00,
         2.17408130e+00,   2.42847737e-03,   1.54530917e+00,
        -2.79362396e-01,   7.10802264e-01,   8.44503234e-03,
         8.02833026e+01,   1.08540997e-01,   6.01075856e+00,
         7.49707356e-03,   9.91907566e-02,   5.62402684e+01,
         7.00702220e-02,   9.22398670e-02,   4.07591384e-05,
         1.03418161e-01,   5.27140192e-02,   4.48146178e-02])

Naive Bayes


In [15]:
import sklearn.naive_bayes

In [16]:
clf = sklearn.naive_bayes.GaussianNB()

In [17]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=4.97276306152
Accuracy=0.185917721519
Logloss=26.1907140911
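
The log loss here is driven by GaussianNB producing near-0/1 probabilities on samples it gets wrong. One thing worth checking (a minimal sketch, not run here, and assuming a scikit-learn recent enough to ship CalibratedClassifierCV, i.e. 0.16+) is whether probability calibration alone improves it:


In [ ]:
# Sketch: Platt-scale the GaussianNB probabilities before scoring log loss.
# Reuses the X_train/X_test split from the cell above.
import sklearn.calibration

cal_clf = sklearn.calibration.CalibratedClassifierCV(
    sklearn.naive_bayes.GaussianNB(), method='sigmoid', cv=3)
cal_clf.fit(X_train, y_train)
print("Calibrated logloss={}".format(
    sklearn.metrics.log_loss(y_test, cal_clf.predict_proba(X_test))))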

Reduce with Feature selection


In [19]:
X_new = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=45).fit_transform(XF.squeeze(0), y)
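
To see which of the four feature groups the 45 selected columns come from, one option is to refit the selector and map its get_support indices back onto the block widths in XF_list. A sketch (the group_names list and offset bookkeeping are illustrative assumptions):


In [ ]:
# Sketch: count how many of the k best columns come from each feature group.
selector = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=45)
selector.fit(XF.squeeze(0), y)
kept = selector.get_support(indices=True)

group_names = ['pftas', 'contourhistogram', 'contourmoments', 'haralick']
offsets = np.cumsum([0] + [xf.shape[2] for xf in XF_list])
for name, lo, hi in zip(group_names, offsets[:-1], offsets[1:]):
    n_kept = np.sum((kept >= lo) & (kept < hi))
    print("{}: {} of {} columns kept".format(name, n_kept, hi - lo))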

In [20]:
my_X = X_new
clf = sklearn.naive_bayes.GaussianNB()

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=0.0727601051331
Accuracy=0.308742088608
Logloss=10.6894260686

Random Forest

On original


In [21]:
import sklearn.ensemble

In [23]:
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=38.6595170498
Accuracy=0.525843881857
Logloss=1.91356867834

This is similar to what just the Contour Moments and Haralick features achieve on their own.
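
The forest's feature_importances_ give a direct way to check that; a quick sketch (reusing the same illustrative group_names/offsets bookkeeping as above):


In [ ]:
# Sketch: sum the fitted random forest's importances over each feature group.
importances = clf.feature_importances_
offsets = np.cumsum([0] + [xf.shape[2] for xf in XF_list])
for name, lo, hi in zip(['pftas', 'contourhistogram', 'contourmoments', 'haralick'],
                        offsets[:-1], offsets[1:]):
    print("{}: total importance {:.3f}".format(name, importances[lo:hi].sum()))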

On reduced


In [24]:
my_X = X_new

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=18.7048509121
Accuracy=0.502043776371
Logloss=1.9520887141

It does slightly worse with fewer features.

Maybe 45 features was too few?


In [25]:
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=24.9658739567
Accuracy=0.529008438819
Logloss=1.85792798987
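
All of these scores come from a single 50/50 split with random_state=42, so small differences may be within noise. A cross-validated check (a sketch, not run here; cv=3 and the smaller n_estimators are arbitrary choices to keep it quick) would look like:


In [ ]:
# Sketch: 3-fold cross-validated accuracy instead of one fixed 50/50 split.
scores = sklearn.cross_validation.cross_val_score(
    sklearn.ensemble.RandomForestClassifier(n_estimators=200, max_depth=20,
                                            min_samples_leaf=5, n_jobs=12),
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, cv=3)
print("CV accuracy: {} +/- {}".format(scores.mean(), scores.std()))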

In [27]:
# Extra trees

my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)

clf = sklearn.ensemble.ExtraTreesClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=41.3399438858
Accuracy=0.495253164557
Logloss=2.0229527739

In [28]:
# Adaboost trees

my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)

clf = sklearn.ensemble.AdaBoostClassifier(n_estimators=1000, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=405.716595173
Accuracy=0.174709915612
Logloss=4.776136176
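
AdaBoost with its default depth-1 stumps struggles on a problem with this many classes. One variation worth trying (a sketch, not run here; max_depth=5 and n_estimators=200 are arbitrary illustrative values) is boosting shallow trees instead:


In [ ]:
# Sketch: AdaBoost over shallow decision trees rather than stumps.
# Reuses the X_train/X_test split from the cell above.
import sklearn.tree

clf = sklearn.ensemble.AdaBoostClassifier(
    base_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=5),
    n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
print("Accuracy={}".format(clf.score(X_test, y_test)))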

Clustering

Try DBSCAN


In [29]:
clusterer = sklearn.cluster.DBSCAN()

In [31]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))

In [34]:
cluster_pred


Out[34]:
array([-1, -1, -1, ..., -1, -1, -1])

It's no good: every sample comes back labelled -1, i.e. DBSCAN treats everything as noise.
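
That usually means the default eps=0.5 is far too small for the scale of these (unstandardised) features. A common way to pick eps is to look at the k-nearest-neighbour distances; a sketch (n_neighbors=5 chosen to echo DBSCAN's default min_samples=5):


In [ ]:
# Sketch: inspect k-distances to choose a sensible eps for DBSCAN.
import sklearn.neighbors

nn = sklearn.neighbors.NearestNeighbors(n_neighbors=5).fit(XF.squeeze(0))
dists, _ = nn.kneighbors(XF.squeeze(0))
# Distance to the 5th neighbour (the query point itself counts as the first).
kth = np.sort(dists[:, -1])
print("median 5-NN distance: {}".format(np.median(kth)))
# An eps near the knee of the sorted k-distance curve would be a more informed start.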

Try KMeans


In [35]:
clusterer = sklearn.cluster.MiniBatchKMeans(n_clusters=11, max_iter=100, batch_size=100,
                                            compute_labels=True, random_state=42)

In [36]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))


Time=0.430577993393

In [38]:
cluster_pred


Out[38]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int32)

In [39]:
import scipy.stats

In [43]:
n_classes = len(np.unique(y))
y_ = np.array(y)
class_clusters = np.ones((n_classes)) * -1

for class_index in range(n_classes):
    li = (y_ == class_index)
    class_clusters[class_index] = scipy.stats.mode(cluster_pred[li])[0]

In [44]:
class_clusters


Out[44]:
array([ 3.,  0.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  5.,  3.,  5.,
        3.,  3.,  3.,  0.,  3.,  3.,  3.,  3.,  3.,  3.,  0.,  3.,  3.,
        3.,  3.,  7.,  3.,  3.,  8.,  3.,  3.,  3.,  3.,  3.,  3.,  1.,
        3.,  3.,  3.,  3.,  3.,  3.,  5.,  0.,  0.,  3.,  3.,  4.,  1.,
        4.,  3.,  3.,  3.,  1.,  1.,  1.,  3.,  1.,  3.,  1.,  1.,  1.,
        4.,  3.,  3.,  0.,  3.,  3.,  4.,  7.,  0.,  1.,  7.,  3.,  3.,
        3.,  3.,  5.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  0.,  3.,  3.,
        0.,  3.,  3.,  1.,  3.,  3.,  1.,  3.,  0.,  0.,  3.,  3.,  3.,
        8.,  3.,  3.,  1.,  4.,  3.,  3.,  3.,  0.,  3.,  3.,  3.,  1.,
        8.,  3.,  3.,  3.])

In [59]:
num_samples_per_class = [sum(y_ == class_index) for class_index in range(n_classes)]
num_samples_per_class = np.array(num_samples_per_class)

In [61]:
num_samples_per_cluster = np.zeros(11)
for cluster_index in range(11):
    li = (class_clusters == cluster_index)
    num_samples_per_cluster[cluster_index] = sum(num_samples_per_class[li])

In [62]:
num_samples_per_cluster


Out[62]:
array([  1260.,   1287.,      0.,  23786.,   1215.,   1677.,      0.,
          872.,    239.,      0.,      0.])
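
Counting samples by each class's modal cluster is only a rough proxy for agreement. scikit-learn's clustering metrics give a more direct measure of how well the 11 clusters line up with the class labels; a minimal sketch using the variables already defined above:


In [ ]:
# Sketch: quantify cluster/class agreement with standard clustering metrics.
print("ARI={}".format(sklearn.metrics.adjusted_rand_score(y_, cluster_pred)))
print("Homogeneity={}".format(sklearn.metrics.homogeneity_score(y_, cluster_pred)))
print("Completeness={}".format(sklearn.metrics.completeness_score(y_, cluster_pred)))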

Try playing around with the clustering parameters (more iterations, larger batch size)


In [144]:
clusterer = sklearn.cluster.MiniBatchKMeans(n_clusters=11, max_iter=5000, batch_size=1500,
                                            compute_labels=True, random_state=42)

In [145]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))


Time=0.405395030975

In [146]:
n_classes = len(np.unique(y))
y_ = np.array(y)
class_clusters = np.ones((n_classes)) * -1

for class_index in range(n_classes):
    li = (y_ == class_index)
    class_clusters[class_index] = scipy.stats.mode(cluster_pred[li])[0]

In [147]:
class_clusters


Out[147]:
array([ 2.,  4.,  2.,  2.,  8.,  8.,  8.,  8.,  8.,  8.,  3.,  2.,  2.,
        2.,  8.,  8.,  2.,  8.,  8.,  2.,  8.,  8.,  8.,  1.,  8.,  8.,
        8.,  8.,  7.,  8.,  2.,  7.,  2.,  8.,  8.,  8.,  8.,  8.,  3.,
        8.,  2.,  8.,  8.,  2.,  8.,  9.,  1.,  2.,  2.,  8.,  2.,  7.,
        2.,  2.,  2.,  8.,  7.,  7.,  7.,  2.,  4.,  8.,  3.,  3.,  3.,
        2.,  2.,  2.,  4.,  8.,  8.,  2.,  7.,  2.,  3.,  7.,  2.,  8.,
        8.,  2.,  2.,  2.,  8.,  8.,  8.,  8.,  8.,  2.,  1.,  2.,  8.,
        3.,  2.,  2.,  3.,  8.,  2.,  3.,  8.,  2.,  2.,  2.,  2.,  2.,
        7.,  2.,  8.,  3.,  2.,  2.,  2.,  2.,  2.,  8.,  2.,  2.,  3.,
        7.,  8.,  8.,  8.])

In [148]:
n_clusters = len(np.unique(cluster_pred))
num_samples_per_cluster = np.zeros(n_clusters)
for cluster_index in range(n_clusters):
    li = (class_clusters == cluster_index)
    num_samples_per_cluster[cluster_index] = sum(num_samples_per_class[li])

In [149]:
num_samples_per_cluster


Out[149]:
array([     0.,     57.,  11615.,   1788.,    654.,      0.,      0.,
         1354.,  14841.,     27.,      0.])

Try Spectral clustering


In [151]:
clusterer = sklearn.cluster.SpectralClustering(n_clusters=8, random_state=42, n_init=10, n_neighbors=10)

In [152]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))


/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/manifold/spectral_embedding_.py:226: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
  warnings.warn("Graph is not fully connected, spectral embedding"
---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
<ipython-input-152-f8759f9194cd> in <module>()
      1 t0 = time.time()
----> 2 cluster_pred = clusterer.fit_predict(XF.squeeze(0))
      3 print("Time={}".format(time.time()-t0))

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/base.pyc in fit_predict(self, X, y)
    343         # non-optimized default implementation; override when a better
    344         # method is possible for a given clustering algorithm
--> 345         self.fit(X)
    346         return self.labels_
    347 

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/cluster/spectral.pyc in fit(self, X)
    445                                            n_init=self.n_init,
    446                                            eigen_tol=self.eigen_tol,
--> 447                                            assign_labels=self.assign_labels)
    448         return self
    449 

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/cluster/spectral.pyc in spectral_clustering(affinity, n_clusters, n_components, eigen_solver, random_state, n_init, eigen_tol, assign_labels)
    253                               eigen_solver=eigen_solver,
    254                               random_state=random_state,
--> 255                               eigen_tol=eigen_tol, drop_first=False)
    256 
    257     if assign_labels == 'kmeans':

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/manifold/spectral_embedding_.pyc in spectral_embedding(adjacency, n_components, eigen_solver, random_state, eigen_tol, norm_laplacian, drop_first, mode)
    301             X[:, 0] = dd.ravel()
    302             lambdas, diffusion_map = lobpcg(laplacian, X, tol=1e-15,
--> 303                                             largest=False, maxiter=2000)
    304             embedding = diffusion_map.T[:n_components] * dd
    305             if embedding.shape[0] == 1:

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/scipy/sparse/linalg/eigen/lobpcg/lobpcg.pyc in lobpcg(A, X, B, M, Y, tol, maxiter, largest, verbosityLevel, retLambdaHistory, retResidualNormsHistory)
    407         # B-orthonormalize the preconditioned residuals.
    408 
--> 409         aux = b_orthonormalize(B, activeBlockVectorR)
    410         activeBlockVectorR, activeBlockVectorBR = aux
    411 

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/scipy/sparse/linalg/eigen/lobpcg/lobpcg.pyc in b_orthonormalize(B, blockVectorV, blockVectorBV, retInvR)
    146             blockVectorBV = blockVectorV  # Shared data!!!
    147     gramVBV = sp.dot(blockVectorV.T, blockVectorBV)
--> 148     gramVBV = sla.cholesky(gramVBV)
    149     gramVBV = sla.inv(gramVBV, overwrite_a=True)
    150     # gramVBV is now R^{-1}.

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/scipy/linalg/decomp_cholesky.pyc in cholesky(a, lower, overwrite_a, check_finite)
     79     """
     80     c, lower = _cholesky(a, lower=lower, overwrite_a=overwrite_a, clean=True,
---> 81                             check_finite=check_finite)
     82     return c
     83 

/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/scipy/linalg/decomp_cholesky.pyc in _cholesky(a, lower, overwrite_a, clean, check_finite)
     28     c, info = potrf(a1, lower=lower, overwrite_a=overwrite_a, clean=clean)
     29     if info > 0:
---> 30         raise LinAlgError("%d-th leading minor not positive definite" % info)
     31     if info < 0:
     32         raise ValueError('illegal value in %d-th argument of internal potrf'

LinAlgError: 4-th leading minor not positive definite
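
Besides this numerical failure, SpectralClustering with the default rbf affinity builds a dense n-by-n affinity matrix, which is heavy for ~30k samples. Two things worth trying (a sketch, not run here; the 5000-sample subsample size is an arbitrary choice): a sparse k-nearest-neighbour affinity, and clustering a random subsample:


In [ ]:
# Sketch: spectral clustering with a kNN affinity on a random subsample.
rng = np.random.RandomState(42)
idx = rng.choice(XF.shape[1], 5000, replace=False)
sub_clusterer = sklearn.cluster.SpectralClustering(
    n_clusters=8, affinity='nearest_neighbors', n_neighbors=10, random_state=42)
sub_pred = sub_clusterer.fit_predict(XF.squeeze(0)[idx])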

In [153]:
clusterer = sklearn.cluster.AgglomerativeClustering(n_clusters=8)

In [ ]:
t0 = time.time()
cluster_pred = clusterer.fit_predict(XF.squeeze(0))
print("Time={}".format(time.time()-t0))

In [ ]:
cluster_pred

Logistic Regression


In [ ]:
import sklearn.linear_model

clf = sklearn.linear_model.LogisticRegression(random_state=42)

In [ ]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))

Linear SVC


In [ ]:
XF.squeeze(0)[:,0:1].shape

In [ ]:
len(y)

In [ ]:
# Try SVC on a single feature element from the vector

import sklearn.svm

clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)[:,0:1]), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))

In [ ]:
# Naive Bayes on a single feature element

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)[:,0:1]), y, test_size=0.5, random_state=42)

print("Time={}".format(time.time()-t0))
t0 = time.time()

clf.fit(X_train, y_train)

print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))

In [ ]:
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))

Non-linear SVC

one-vs-one


In [ ]:
clf = sklearn.svm.SVC(probability=True, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))